##### Georgia WRU race-prediction script. Handles personally identifying information (PII); keep this file and its data private.
library(devtools)
library(foreign)
library(rgdal)
library(sp)
library(dplyr)
library(wru)
library(gridExtra)
library(stringi)
library(stringr)
library(tidyverse)
library(data.table)
options(stringsAsFactors = FALSE)


### Setting the working directory. The code below is commented out; uncomment it when running in RStudio.
#ga_wd <- dirname(rstudioapi::getActiveDocumentContext()$path)
#setwd(ga_wd)
#setwd("voterfile")
#list.files()
###read in the geocoded addresses, and the voterfile (which includes addresses)
ga_geocoded <- read.csv("ga_geocoded_all.csv")
ga_voterfile <- read.csv("ga_voterfile.csv")
###let's merge the data now 
ga_voterfile$residence_zipcode <- substr(ga_voterfile$residence_zipcode,1,5)
ga_voterfile$full_addr <- paste0(ga_voterfile$residence_house_number,sep=" ",ga_voterfile$residence_street_name,sep=" ",
                                 ga_voterfile$residence_city,sep=", ", "GA", sep=" ",ga_voterfile$residence_zipcode)
###read in blocks
setwd(ga_wd)
setwd("blocks")
list.files()

####
start_time_blocks <- Sys.time()
ga_blocks <- readOGR(getwd(),"tl_2010_13_tabblock10")

####let's now project everything 
ga_geocoded_coor <- subset(ga_geocoded, select=c(X,Y)) #no addrs missing 
ga_geocoded <- SpatialPointsDataFrame(coords=ga_geocoded_coor,data=ga_geocoded,
                                      proj4string = 
                                        CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
ga_blocks <- spTransform(ga_blocks, CRS=CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))

###let's overlay 
ga_geocoded$block <- over(ga_geocoded,ga_blocks)$BLOCKCE10
ga_geocoded$tract <- over(ga_geocoded,ga_blocks)$TRACTCE10
ga_geocoded$county <- over(ga_geocoded,ga_blocks)$COUNTYFP10
sum(is.na(ga_geocoded$block))#only 3901 missing 
ga_geocoded_blk <- ga_geocoded@data
saveRDS(ga_geocoded_blk, "ga_geocoded_blk.rds")
rm(ga_geocoded,ga_blocks)
###will now merge on 
ga_voterfile <- merge(ga_voterfile,ga_geocoded_blk,bu="full_addr",all.x=T)
####let's get the blocks predicted first 
census.ga <- get_census_data("b85306550d1fd788ddc045abfa6acf6ba7110abc",state=c("GA"),age=FALSE,sex=FALSE)
ga_voterfile$state <- "ga"
colnames(ga_voterfile)[colnames(ga_voterfile)=="last_name"] <- "surname"
ga_voterfile <- predict_race(ga_voterfile, census.geo = "block", census.data = census.ga,
                                     age=FALSE, sex=FALSE)
sum(is.na(ga_voterfile$pred.whi)) # 343,166 missing; quite a lot 
setwd(ga_wd)
saveRDS(ga_voterfile, "ga_voterfile_geocoded.rds")
end_time_blocks <- Sys.time()

block_time_total <- end_time_blocks - start_time_blocks
time_list <- list("stage 1 time"=block_time_total)
time_list
saveRDS(time_list,"time_list.rds")
####let's rename now 


pred_names <- colnames(ga_voterfile[,grep("pred.", colnames(ga_voterfile))])

for (i in 1:5) {
  colnames(ga_voterfile)[colnames(ga_voterfile)==pred_names[i]] <-
    paste0(pred_names[i],sep="_","blocks")
}
####now for tracts 
ga_voterfile <- predict_race(ga_voterfile, census.geo = "tract", census.data = census.ga,
                             age=FALSE, sex=FALSE)
pred_names <- colnames(ga_voterfile[,grep("pred.", colnames(ga_voterfile))])
for (i in 6:10) {
  colnames(ga_voterfile)[colnames(ga_voterfile)==pred_names[i]] <-
    paste0(pred_names[i],sep="_","tract")
}
####let's do surname next 
ga_voterfile <- predict_race(ga_voterfile, surname.only = TRUE)
pred_names <- colnames(ga_voterfile[,grep("pred.", colnames(ga_voterfile))])
pred_names
for (i in 11:15) {
  colnames(ga_voterfile)[colnames(ga_voterfile)==pred_names[i]] <-
    paste0(pred_names[i],sep="_","surname")
}
###county now 
ga_county_code_df <- ga_voterfile %>% group_by(county,county_code) %>% tally()
ga_county_code_df2 <- ga_county_code_df %>% group_by(county_code) %>% slice(which.max(n))
nrow(ga_county_code_df2)
sum(ga_county_code_df2$n)
(7200399/nrow(ga_voterfile))*100
# so at least 2% are incorrectly coded due to geocoder. good to know 
colnames(ga_voterfile)[colnames(ga_voterfile)=="county"] <- "county_old"
###let's merge on now 
ga_voterfile <- merge(ga_voterfile, ga_county_code_df2, by="county_code")

###now let's do the county level predictions 
ga_voterfile <- predict_race(ga_voterfile, census.geo = "county", census.data = census.ga,
                              age=FALSE, sex=FALSE)
#saveRDS(ga_voterfile, "ga_voterfile_geocoded2.rds")
pred_names <- colnames(ga_voterfile[,grep("pred.", colnames(ga_voterfile))])
pred_names
for (i in 16:21) {
  colnames(ga_voterfile)[colnames(ga_voterfile)==pred_names[i]] <-
    paste0(pred_names[i],sep="_","county")
}



###now let's slim and take out pii in voter file 
names(ga_voterfile)
ga_voterfile2 <- subset(ga_voterfile, select=c(county_code, surname,race,race_desc,gender,block,tract,county,state,
                                               residence_zipcode,
                                              pred.whi_blocks, pred.bla_blocks, pred.his_blocks, pred.asi_blocks,  
                                              pred.oth_blocks,pred.whi_tract,pred.bla_tract, pred.his_tract,
                                              pred.asi_tract, pred.oth_tract,            
                                               pred.whi_surname, pred.bla_surname, pred.his_surname,pred.asi_surname,          
                                               pred.oth_surname,pred.whi_county,pred.bla_county,
                                               pred.his_county, pred.asi_county,pred.oth_county))

saveRDS(ga_voterfile2, "ga_voterfile_geocoded_postpii.rds")



